from urllib.request import urlopen
from bs4 import BeautifulSoup
import os, pymysql, time

# Import the chunked PHP manual pages into the `documents` and
# `documents_content` tables: one row per HTML file, carrying its title,
# keyword, body markup and the ids of its previous / next / parent pages.

# Parameterized lookup shared by every "file name -> document id" query, so
# that names taken from the HTML are never spliced into the SQL text.
_DOCUMENT_ID_SQL = 'SELECT id FROM documents WHERE file_name = %s'


def _find_document_id(cursor, file_name):
    """Return the id of the document stored under *file_name*, or None."""
    cursor.execute(_DOCUMENT_ID_SQL, (file_name,))
    row = cursor.fetchone()
    return None if row is None else row[0]


def _resolve_link_id(cursor, soup, css_class):
    """Return the document id targeted by the navigation div *css_class*.

    Only direct child tags that carry an ``href`` attribute are considered;
    text nodes and tags without ``href`` are ignored.  Returns 0 when the div
    is missing, holds no usable link, or the linked page is not in the
    database (for example because it has not been imported yet).
    """
    link_id = 0
    container = soup.find('div', {'class': css_class})
    if container is None:
        return link_id

    for link in container.find_all(href=True, recursive=False):
        # "function.foo.html" -> "function.foo"
        target_name = link['href'].rpartition('.')[0]
        if not target_name:
            continue
        found = _find_document_id(cursor, target_name)
        if found is not None:
            # As before, the last resolvable link wins.
            link_id = found
    return link_id


def _extract_keyword(soup):
    """Return the text of the first <h1>, else the first <h2>, else ''."""
    for heading_name in ('h1', 'h2'):
        heading = soup.find(heading_name)
        if heading is not None:
            return heading.get_text()
    return ''


# Relative path; os.path.realpath resolves it against the current working
# directory.
targetDir = '../../../resources/assets/php-chunked-xhtml/'
realTargetDirPath = os.path.realpath(targetDir)

# Process the pages in a stable, alphabetical order.
filelist = sorted(os.listdir(realTargetDirPath))

# Database connection. Keyword arguments are used because PyMySQL 1.0+ no
# longer accepts positional ones.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration or environment variables.
dbConn = pymysql.connect(
    host='127.0.0.1',
    user='dengqihua',
    password='dengqihua520',
    database='phpdoc',
    charset='utf8mb4',
)

# Database cursor shared by every statement below.
curserObj = dbConn.cursor()

try:
    for file in filelist:
        print('开始执行：' + file)

        filePath = os.path.join(realTargetDirPath, file)
        fileName, _, extension = file.rpartition('.')

        # Skip anything that is not a regular "<name>.html" file.
        if not os.path.isfile(filePath) or extension != 'html' or fileName == '':
            continue

        nowTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        with open(filePath, 'r', encoding='UTF-8') as fileObj:
            html = fileObj.read()

        bs4Obj = BeautifulSoup(html, 'html.parser')

        # Page title, taken from <head><title>.
        title = bs4Obj.head.title.get_text()

        # Page body: the node that immediately follows the first <hr>.
        # Raises AttributeError when the page contains no <hr>.
        content = str(bs4Obj.find('hr').next_sibling)

        # Keyword: first <h1>, falling back to the first <h2>.
        keyword = _extract_keyword(bs4Obj)

        # Ids of the previous, next and parent pages (0 when unresolved).
        prevId = _resolve_link_id(curserObj, bs4Obj, 'prev')
        nextId = _resolve_link_id(curserObj, bs4Obj, 'next')
        upId = _resolve_link_id(curserObj, bs4Obj, 'up')

        try:
            docId = _find_document_id(curserObj, fileName)
        except pymysql.MySQLError as err:
            print('数据查询出错！', err)
            # Exit with a failure status; the finally clause below still
            # closes the cursor and the connection.
            raise SystemExit(1)

        if docId is not None:
            # The page is already stored: update it.
            sql = "UPDATE documents SET title = %s, keyword = %s,prev_id = %s, next_id = %s, up_id = %s, update_time = %s WHERE id = %s"
            curserObj.execute(sql, (title, keyword, prevId, nextId, upId, nowTime, docId))

            sql = "UPDATE documents_content SET content = %s WHERE id = %s"
            curserObj.execute(sql, (content, docId))

            dbConn.commit()

            print((' ' * 5) + "更新成功！")
        else:
            # New page: insert it.
            sql = "INSERT INTO documents(file_name, title, keyword, prev_id, next_id, up_id, create_time, update_time) values(%s, %s, %s, %s, %s, %s, %s, %s)"
            curserObj.execute(sql, (fileName, title, keyword, prevId, nextId, upId, nowTime, nowTime))

            docId = curserObj.lastrowid

            sql = "INSERT INTO documents_content(id, content) VALUES(%s, %s)"
            curserObj.execute(sql, (docId, content))

            # Commit both rows together so that a failure between the two
            # inserts cannot leave a document without its content row.
            dbConn.commit()

            print((' ' * 5) + "添加成功！")
finally:
    # Release the cursor and the connection even when a page fails.
    curserObj.close()
    dbConn.close()


